* Get life-time hourly wages: load the sample and mark each person's
* 30-year window after graduation
clear
use "Data\sample1_new.dta"

* Remove variables left over from earlier steps (year1 is rebuilt below)
drop year1 year2 year3 year4 year5 year9 year10 year11 ///
	start_wage_ambition ten_wage_ambition old_ambition ///
	individual_wage_growth_ambition extreme_ambition ///
	wage10_ambition_temp n_wage10_ambition_group ///
	wage_start_ambition_temp n_wage_start_ambition_group ///
	wage10_ambition wage_start_ambition ///
	n_wage10_individual_ambition n_wage_start_individual_ambition ///
	wage10_mean_ambition wage_start_mean_ambition wage_growth_ambition

* Snapshot of the data; it is merged back on later
save "Data\temp.dta", replace

*use "Data\temp.dta"

* year1 = graduation year, filled only for persons who have a row in their
* graduation year and whose graduation year is 1980 or later
gen in_grad_year=1 if final_educ_year==aar
bysort pnr: egen has_grad_year=max(in_grad_year)
gen year1=final_educ_year if final_educ_year>=1980 & final_educ_year!=. & has_grad_year==1
drop in_grad_year has_grad_year

* obs_for_30 = 1 for every row of a person who also has a row in year 30
* (year1+29); it stays missing otherwise
gen year30=year1+29 if year1!=.
gen in_year30=1 if aar==year30 & year1!=.
bysort pnr: egen obs_for_30=max(in_year30)
drop in_year30

* Years since graduation, counting the graduation year as 1; values outside
* 1..30 are set to missing
gen year_since_grad=aar-year1+1
replace year_since_grad=. if !inrange(year_since_grad,1,30)

**Get life time wages

* Keep only the variables used below (speeds things up)
keep pnr aar final_educ grad_region individual log_wage obs_for_30 year_since_grad


* Result variables; they are filled in program by program further down
gen wage_deflated=.
gen life_wage=.

sort final_educ pnr aar

* Rows with final_educ==1 are not processed: they seed the output frame and
* are removed from the working data
frame copy default temp
frame temp: keep if final_educ==1
frame copy temp output
drop if final_educ==1
frame drop temp

* Rows with missing final_educ are moved to the output frame in the same way
frame copy default temp
frame temp: keep if final_educ==.
frame output: frameappend temp
drop if final_educ==.
frame drop temp

* Check whether some programs have too few people observed for 30 years.
* We want at least 30 individuals for meaningful regressions.
* temp2 = per program, the sum of obs_for_30 over rows with individual==1
by final_educ: egen n30_part=sum(obs_for_30) if individual==1
by final_educ: egen temp2=max(n30_part)

* Programs below the threshold are moved to the output frame unprocessed
* (temp2 travels with them into the output frame)
frame copy default temp
frame temp: keep if temp2<30
frame temp: drop n30_part
frame output: frameappend temp
drop if temp2<30
drop n30_part temp2
frame drop temp


*Program level
* For every remaining program (distinct final_educ value) this section:
*   1. regresses log_wage on calendar-year dummies (base year 2000) and
*      years-since-graduation dummies, using persons with obs_for_30==1,
*   2. subtracts the estimated calendar-year effect from log_wage
*      (wage_deflated),
*   3. sums wage_deflated within person (life_wage),
*   4. moves the finished program from the default frame to the output frame.

* programs = column vector holding the distinct values of final_educ
levelsof final_educ, matrow(programs)

mat dir /*shows the dimensions of programs; the row count depends on the data*/

* Take the number of programs from the matrix instead of hard-coding it, so
* that the loop neither runs past the last program nor leaves programs behind
* when the sample changes
local n_programs = rowsof(programs)

/*Loop takes around 24 hours to run*/

forvalues j=1(1)`n_programs'{

	* Work on a copy that holds only program j
	frame copy default temp
	frame change temp
	keep if final_educ==programs[`j',1]

	qui reg log_wage ib2000.aar i.year_since_grad if obs_for_30==1 & final_educ==programs[`j',1]

	* temp = estimated calendar-year effect; 0 for the base year 2000
	gen temp=0 if aar==2000 & obs_for_30==1


	local p: colfullnames e(b)

	foreach i of local p{

		* Only names with "." as the 5th character (four digits followed by
		* ".", i.e. the non-base year terms) are picked up here
		if substr("`i'",5,1)=="." {
			replace temp=_b[`=substr("`i'",1,4)'.aar] if aar==`=substr("`i'",1,4)' & obs_for_30==1
		}
	}


	* log wage net of the calendar-year effect, inside the 30-year window only
	replace wage_deflated=log_wage-temp if !missing(year_since_grad) & obs_for_30==1 & final_educ==programs[`j',1]

	drop temp

	* Person-level sum of the deflated log wages
	bysort pnr: egen temp=sum(wage_deflated) if final_educ==programs[`j',1]

	replace life_wage=temp
	* A sum of exactly 0 is stored as missing
	replace life_wage=. if life_wage==0

	drop temp

	* Append the finished program to the output frame and remove it from the
	* working data
	frame change output
	frameappend temp
	frame change default
	drop if final_educ==programs[`j',1]
	frame drop temp

}

* Continue in the output frame, which now holds all rows; temp2 is a helper
* variable left over from the program-size check
frame change output
drop temp2

*Save and continue from here*
save "Data\temp2.dta", replace

*use "Data\temp2.dta"

*Group by final_educ*
*get group variables
* p_life_wage = mean of life_wage over rows with individual==1, per program,
* spread to every row of the program. It is not computed for final_educ==1
* or missing final_educ.
bysort final_educ: egen prog_mean=mean(life_wage) if final_educ!=. & final_educ!=1 & individual==1
by final_educ: egen p_life_wage=max(prog_mean)
drop prog_mean



*Fix problem with some large "out-dated" educational programs
**OBS: Many of these steps do not assign anything because less than 30 people are observed to enter with these programs and then stay on for 30 years

* Old codes that take over the p_life_wage of the region-specific code
* 1109<region>. The same list drives both the p_life_wage assignment and the
* recoding of final_educ, so the two steps cannot drift apart.
* NOTE(review): the previous version assigned p_life_wage to 1007 but recoded
* 1107 instead, leaving 1007 rows without a region-specific code. This is
* treated as a typo here; confirm that 1107 is not a code that should be
* handled as well.
local old_9th 1007 1008 1023 1123 1009 1022

forvalues i=81(1)85{

	*9th grade
	* temp_2 = p_life_wage of program 1109<region>, copied to every row
	gen temp=p_life_wage if final_educ==1109`i'
	egen temp_2=max(temp)

	foreach code of local old_9th {
		* Borrow the value, then make the code region-specific by appending
		* the region number
		replace p_life_wage=temp_2 if final_educ==`code' & grad_region==`i'
		replace final_educ=`code'`i' if final_educ==`code' & grad_region==`i'
	}
	drop temp temp_2

	*10th grade
	* Same procedure: old code 1010 borrows from 1110<region>
	gen temp=p_life_wage if final_educ==1110`i'
	egen temp_2=max(temp)
	replace p_life_wage=temp_2 if final_educ==1010 & grad_region==`i'
	drop temp temp_2

	replace final_educ=1010`i' if final_educ==1010 & grad_region==`i'

}

*3.g
* Program 1097 borrows p_life_wage from program 1198 (no recoding here)
gen temp=p_life_wage if final_educ==1198
egen temp_2=max(temp)
replace p_life_wage=temp_2 if final_educ==1097
drop temp temp_2


**kmeans**
* Standardize p_life_wage (subtract the mean, divide by the standard
* deviation); the two moments are kept in scalars
summarize p_life_wage
scalar the_mean_s=r(mean)
scalar the_sd_s=r(sd)
gen p_life_wage_s=(p_life_wage-the_mean_s)/the_sd_s
summarize p_life_wage_s


* Four clusters on the standardized variable; k random starting
* observations drawn with seed 1234. The grouping variable is named
* life_wage_ambition.
cluster kmeans p_life_wage_s, k(4) name(life_wage_ambition) start(krandom(1234))
tabulate life_wage_ambition

tabstat p_life_wage_s, by(life_wage_ambition)

*SAVE*
save "Data\life_wage_ambition.dta", replace

**Get final version**

*use "Data\life_wage_ambition.dta"


**

* Bring back the civil-status, partner, sex and age variables from the
* snapshot saved at the start
merge 1:1 pnr aar using "Data\temp.dta", keepusing(civst aegte_nr faelle_nr koen age)

drop _merge

*Define marital status
* relationship: civst is "G" or faelle_nr is non-empty
* married:      civst is "G"
* cohab:        in a relationship but not married
gen relationship=(civst=="G" | faelle_nr!="")
gen married=(civst=="G")
gen cohab=(relationship==1 & married==0)

*Make couple ID based on the PNR of the man
* koen "1" rows contribute their own pnr; koen "2" rows contribute aegte_nr
* when married and faelle_nr when cohabiting. Everyone else keeps ".".
gen couple_id="."
replace couple_id=pnr       if relationship==1 & koen=="1"
replace couple_id=aegte_nr  if married==1 & koen=="2"
replace couple_id=faelle_nr if cohab==1 & koen=="2"
sort couple_id aar

*Keep only couples where we observe both partners
* Within a couple-year, the mean of the numeric koen code lies strictly
* between 1 and 2 only when both codes occur
gen sex_code=koen if relationship==1
destring sex_code, replace
by couple_id aar: egen sex_mean=mean(sex_code)
keep if relationship==0 | (relationship==1 & sex_mean>1 & sex_mean<2)
drop sex_code sex_mean

//generate partners' age
* Per couple-year: the largest age among koen "1" rows (age_male) and among
* koen "2" rows (age_female), copied to every row of the couple-year
bysort couple_id aar: egen age_male=max(cond(koen=="1",age,.))
by couple_id aar: egen age_female=max(cond(koen=="2",age,.))

*Age restriction
* Couples: both partners aged 19-60; persons not in a relationship: own age 19-60
keep if (relationship==1 & inrange(age_male,19,60) & inrange(age_female,19,60)) | (relationship==0 & inrange(age,19,60))
*zero deleted

*Only couples
keep if relationship==1


drop individual

sort pnr aar

save "Data\life_wage_ambition_final.dta", replace
